[Machine Learning] Naive Bayes sentiment analysis of IMDb reviews (CountVectorizer vs. TfidfVectorizer)


I. Introduction

1. The IMDb review dataset: introduction and download
2. A brief introduction to the naive Bayes principle
3. What TF-IDF is (a toy sketch of the count vs. TF-IDF representations follows right below)
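Since points 2 and 3 are only named here, a short illustration may help. CountVectorizer turns each document into raw term counts, while TfidfVectorizer re-weights those counts so that words appearing in almost every document contribute less (scikit-learn's default is tf-idf(t, d) = tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1), followed by L2 normalization). The MultinomialNB model used later simply applies Bayes' rule to these features, picking the class c that maximizes P(c) · ∏ P(word | c) over the words of a review. The snippet below is a toy sketch; the three example sentences are made up for illustration:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# a made-up three-document corpus, just to show the two representations
docs = ["this movie is great",
        "this movie is terrible",
        "great acting and a great story"]

cv = CountVectorizer()
counts = cv.fit_transform(docs)       # sparse document-term matrix of raw counts
print(cv.get_feature_names_out())     # the learned vocabulary (get_feature_names() on scikit-learn < 1.0)
print(counts.toarray())               # e.g. "great" counts as 2 in the third document

tfidf = TfidfVectorizer()
weights = tfidf.fit_transform(docs)   # same shape, but tf-idf weighted and L2-normalized
print(weights.toarray())              # words shared by every document get relatively lower weight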

II. Code walkthrough

The preprocess_data() method performs the following steps: load the IMDb review data; fit a CountVectorizer and a TfidfVectorizer on it; save the fitted CountVectorizer and TfidfVectorizer models, because at prediction time they are needed to turn the text being predicted into word vectors; and save the vectorized datasets.

def preprocess_data():
    X_orig, Y_orig = get_data(aclImdb_train_dir)
    X_orig_test, Y_orig_test = get_data(aclImdb_test_dir)
    X_orig = np.concatenate([X_orig, X_orig_test])
    Y_orig = np.concatenate([Y_orig, Y_orig_test])
    cv = CountVectorizer(max_features=vocab_size)
    tfidf = TfidfVectorizer(max_features=vocab_size)
    cv.fit(X_orig)
    tfidf.fit(X_orig)
    print(cv.vocabulary_)
    # transform the training data into a document-term matrix (a sparse matrix) with transform()
    train_data = cv.transform(X_orig)
    tfidf_train_data = tfidf.transform(X_orig)
    # each sparse entry is (document index, vocabulary index) -> term frequency
    print(cv.get_feature_names())
    print(train_data)
    train_data = train_data.toarray()
    tfidf_train_data = tfidf_train_data.toarray()
    print(train_data)
    joblib.dump(cv, "data/CountVectorizer.joblib")
    joblib.dump(tfidf, "data/TfidfVectorizer.joblib")
    np.savez(r'data\CountVectorizer_trainData', x=train_data, y=Y_orig)
    np.savez(r'data\TfidfVectorizer_trainData', x=tfidf_train_data, y=Y_orig)
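One small compatibility note on the snippet above: cv.get_feature_names() was removed in scikit-learn 1.2, so on a current installation the equivalent call is the one below; everything else is unchanged.

print(cv.get_feature_names_out())  # available since scikit-learn 1.0; replaces get_feature_names(), removed in 1.2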

Running it generates the following four files under the data/ directory: CountVectorizer.joblib, TfidfVectorizer.joblib, CountVectorizer_trainData.npz, and TfidfVectorizer_trainData.npz.
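If you want to sanity-check what was written, a minimal sketch along these lines could be used (the file names come from the code above; the exact shapes depend on how many reviews were loaded):

import joblib
import numpy as np

cv = joblib.load("data/CountVectorizer.joblib")
print(len(cv.vocabulary_))                 # should equal vocab_size (30000)

train = np.load(r"data/CountVectorizer_trainData.npz")
print(train["x"].shape, train["y"].shape)  # (number of reviews, vocab_size) and (number of reviews,)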

The train_my_module() method: load the previously generated word vectors, train a MultinomialNB Bayes model, and test the model.

def train_my_module(is_tfidf):
    if is_tfidf:
        trainDataNew = np.load(r'data/TfidfVectorizer_trainData.npz')
    else:
        trainDataNew = np.load('data/CountVectorizer_trainData.npz')
    x = trainDataNew['x']
    y = trainDataNew['y']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    x_train, x_test, y_train, y_test = np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)
    print(x_train.shape, x_test.shape)
    # train the model
    module = MultinomialNB()
    module.fit(x_train, y_train)
    # evaluate on the held-out split
    y_pred = module.predict(x_test)
    if is_tfidf:
        joblib.dump(module, r'data/Tfidf_bayes_module.joblib')
    else:
        joblib.dump(module, r'data/bayes_module.joblib')
    # report results
    print("Ground truth: {0}".format(y_test))
    print("Predictions: {0}".format(y_pred))
    print("Accuracy: %f%%" % (accuracy_score(y_test, y_pred) * 100))

The predict_my_module() method: load the trained Bayes model, load the fitted vectorizer, convert the text to be predicted into word vectors, and predict.

def predict_my_module(is_tfidf):
    if is_tfidf:
        model = joblib.load(r'data/Tfidf_bayes_module.joblib')
    else:
        model = joblib.load(r'data/bayes_module.joblib')
    # neg: 0, positive: 1
    review = ["the character is so poorly written.",
              "this is bad movie ",
              "I'm not very disappoint for this movie",
              "I'm very happy for this movie"]
    if is_tfidf:
        cv = joblib.load(r'data/TfidfVectorizer.joblib')
    else:
        cv = joblib.load(r'data/CountVectorizer.joblib')
    train_data = cv.transform(review)
    train_data = train_data.toarray()
    s = model.predict(train_data)
    print(s)

III. Comparing the results of CountVectorizer and TfidfVectorizer

If is_tfidf is False, the model is trained on the word vectors produced by CountVectorizer; if is_tfidf is True, it is trained on the word vectors produced by TfidfVectorizer.

if __name__ == '__main__':
    preprocess_data()
    is_tfidf = True
    train_my_module(is_tfidf)
    # predict_my_module(is_tfidf)
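To generate the comparison in one run rather than flipping is_tfidf by hand, a small variation of this main block could loop over both settings (this loop is my sketch, not part of the original code):

if __name__ == '__main__':
    preprocess_data()
    for is_tfidf in (False, True):   # train once on CountVectorizer features, once on TF-IDF features
        train_my_module(is_tfidf)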

The results are shown below; the model trained on the TfidfVectorizer word vectors achieves the higher accuracy.

CountVectorizer    accuracy: 84.326667%
TfidfVectorizer    accuracy: 85.893333%

IV. Complete sentiment analysis code

import numpy as np
import re
import os
import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vocab_size = 30000
aclImdb_train_dir = r'D:\train_data\aclImdb\aclImdb\train'
aclImdb_test_dir = r'D:\train_data\aclImdb\aclImdb\test'

# remove html tags such as '<br />'
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub(' ', text)

def clean_str(string):
    return string.strip().lower()

def process(text):
    text = clean_str(text)
    text = rm_tags(text)
    return text

def get_data(datapath):
    pos_files = os.listdir(datapath + '/pos')
    neg_files = os.listdir(datapath + '/neg')
    print(len(pos_files))
    print(len(neg_files))
    pos_all = []
    neg_all = []
    for pf, nf in zip(pos_files, neg_files):
        with open(datapath + '/pos' + '/' + pf, encoding='utf-8') as f:
            s = f.read()
            s = process(s)
            pos_all.append(s)
        with open(datapath + '/neg' + '/' + nf, encoding='utf-8') as f:
            s = f.read()
            s = process(s)
            neg_all.append(s)
    X_orig = np.array(pos_all + neg_all)
    Y_orig = np.array([1 for _ in range(len(pos_all))] + [0 for _ in range(len(neg_all))])
    return X_orig, Y_orig

def preprocess_data():
    X_orig, Y_orig = get_data(aclImdb_train_dir)
    X_orig_test, Y_orig_test = get_data(aclImdb_test_dir)
    X_orig = np.concatenate([X_orig, X_orig_test])
    Y_orig = np.concatenate([Y_orig, Y_orig_test])
    cv = CountVectorizer(max_features=vocab_size)
    tfidf = TfidfVectorizer(max_features=vocab_size)
    cv.fit(X_orig)
    tfidf.fit(X_orig)
    print(cv.vocabulary_)
    # transform the training data into a document-term matrix (a sparse matrix) with transform()
    train_data = cv.transform(X_orig)
    tfidf_train_data = tfidf.transform(X_orig)
    print(cv.get_feature_names())
    print(train_data)
    train_data = train_data.toarray()
    tfidf_train_data = tfidf_train_data.toarray()
    print(train_data)
    joblib.dump(cv, "data/CountVectorizer.joblib")
    joblib.dump(tfidf, "data/TfidfVectorizer.joblib")
    np.savez(r'data\CountVectorizer_trainData', x=train_data, y=Y_orig)
    np.savez(r'data\TfidfVectorizer_trainData', x=tfidf_train_data, y=Y_orig)

def train_my_module(is_tfidf):
    if is_tfidf:
        trainDataNew = np.load(r'data/TfidfVectorizer_trainData.npz')
    else:
        trainDataNew = np.load('data/CountVectorizer_trainData.npz')
    x = trainDataNew['x']
    y = trainDataNew['y']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    x_train, x_test, y_train, y_test = np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)
    print(x_train.shape, x_test.shape)
    # train the model
    module = MultinomialNB()
    module.fit(x_train, y_train)
    # evaluate on the held-out split
    y_pred = module.predict(x_test)
    if is_tfidf:
        joblib.dump(module, r'data/Tfidf_bayes_module.joblib')
    else:
        joblib.dump(module, r'data/bayes_module.joblib')
    # report results
    print("Ground truth: {0}".format(y_test))
    print("Predictions: {0}".format(y_pred))
    print("Accuracy: %f%%" % (accuracy_score(y_test, y_pred) * 100))

def predict_my_module(is_tfidf):
    if is_tfidf:
        model = joblib.load(r'data/Tfidf_bayes_module.joblib')
    else:
        model = joblib.load(r'data/bayes_module.joblib')
    # neg: 0, positive: 1
    review = ["the character is so poorly written.",
              "this is bad movie ",
              "I'm not very disappoint for this movie",
              "I'm very happy for this movie"]
    if is_tfidf:
        cv = joblib.load(r'data/TfidfVectorizer.joblib')
    else:
        cv = joblib.load(r'data/CountVectorizer.joblib')
    train_data = cv.transform(review)
    train_data = train_data.toarray()
    s = model.predict(train_data)
    print(s)

if __name__ == '__main__':
    preprocess_data()
    is_tfidf = True
    train_my_module(is_tfidf)
    # predict_my_module(is_tfidf)
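One practical note before running the full script: the aclImdb_train_dir / aclImdb_test_dir paths are machine-specific and must point at your local copy of the IMDb dataset, and the data/ output directory is never created by the script, so np.savez and joblib.dump will fail if it does not already exist. A one-liner such as the following (my addition, not in the original) avoids that:

os.makedirs('data', exist_ok=True)  # ensure the output directory exists before preprocess_data() runs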

